library(tidyverse)
library(here)
library(knitr)
library(DT)
library(kableExtra)Lab 9
Set - Up
Part 2
names <- read_csv(here::here("supporting_artifacts", "supporting_data", "StateNames_A.csv")) |>
rename(Sex = Gender)
names |>
datatable()Part 3
1.
In my first attempt at this problem, I forgot to include the output!
names_A <- names |>
mutate(CountA = ifelse(Name == "Allison", Count, 0))
names_A |>
group_by(State, Sex) |>
summarise(CountA = sum(CountA), .groups = "keep") |>
arrange(State) |>
kable() |>
scroll_box(width = "500px", height = "200px")| State | Sex | CountA |
|---|---|---|
| AK | F | 232 |
| AK | M | 0 |
| AL | F | 1535 |
| AL | M | 0 |
| AR | F | 1198 |
| AR | M | 0 |
| AZ | F | 1880 |
| AZ | M | 0 |
| CA | F | 12413 |
| CA | M | 0 |
| CO | F | 1594 |
| CO | M | 0 |
| CT | F | 1099 |
| CT | M | 0 |
| DC | F | 321 |
| DC | M | 0 |
| DE | F | 294 |
| DE | M | 0 |
| FL | F | 4455 |
| FL | M | 0 |
| GA | F | 3257 |
| GA | M | 0 |
| HI | F | 183 |
| HI | M | 0 |
| IA | F | 1477 |
| IA | M | 0 |
| ID | F | 451 |
| ID | M | 0 |
| IL | F | 5110 |
| IL | M | 0 |
| IN | F | 3067 |
| IN | M | 0 |
| KS | F | 1283 |
| KS | M | 0 |
| KY | F | 1905 |
| KY | M | 20 |
| LA | F | 1209 |
| LA | M | 0 |
| MA | F | 2218 |
| MA | M | 0 |
| MD | F | 2229 |
| MD | M | 0 |
| ME | F | 340 |
| ME | M | 0 |
| MI | F | 4014 |
| MI | M | 0 |
| MN | F | 2374 |
| MN | M | 0 |
| MO | F | 2882 |
| MO | M | 0 |
| MS | F | 817 |
| MS | M | 0 |
| MT | F | 226 |
| MT | M | 0 |
| NC | F | 3435 |
| NC | M | 0 |
| ND | F | 285 |
| ND | M | 0 |
| NE | F | 807 |
| NE | M | 0 |
| NH | F | 412 |
| NH | M | 0 |
| NJ | F | 3052 |
| NJ | M | 0 |
| NM | F | 399 |
| NM | M | 0 |
| NV | F | 729 |
| NV | M | 0 |
| NY | F | 5747 |
| NY | M | 0 |
| OH | F | 5487 |
| OH | M | 0 |
| OK | F | 1421 |
| OK | M | 0 |
| OR | F | 1186 |
| OR | M | 0 |
| PA | F | 4307 |
| PA | M | 0 |
| RI | F | 306 |
| RI | M | 0 |
| SC | F | 1228 |
| SC | M | 0 |
| SD | F | 376 |
| SD | M | 0 |
| TN | F | 2488 |
| TN | M | 0 |
| TX | F | 10192 |
| TX | M | 0 |
| UT | F | 1125 |
| UT | M | 0 |
| VA | F | 3220 |
| VA | M | 0 |
| VT | F | 135 |
| VT | M | 0 |
| WA | F | 1956 |
| WA | M | 0 |
| WI | F | 2367 |
| WI | M | 0 |
| WV | F | 813 |
| WV | M | 0 |
| WY | F | 142 |
| WY | M | 0 |
2.
names_A <- names_A |>
filter(Sex == "F")3.
Originally, I thought it would be simpler to filter by name, rather than filtering by count and sex because there are no male Allisons. However, explicitly filtering by sex is clearer for the reader and is more thorough!
names_year <- names_A |>
group_by(Year) |>
summarise(Count = sum(Count), .groups = "keep")
ggplot(data = names_year, aes(x = Year, y = Count)) +
geom_col(fill = "#bf5850") +
theme_light() Part 4
4.
model_1 <- names_year |>
lm(Count ~ Year, data = _)
kable(broom::tidy(model_1))| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | -5359999.951 | 992171.5797 | -5.402291 | 5.87e-05 |
| Year | 2799.631 | 494.7236 | 5.658979 | 3.56e-05 |
5.
ggplot(data = names_year, aes(x = Year, y = Count)) +
geom_point(color = "#bf5850") +
geom_smooth(method = "lm", color = "#f7b034") +
theme_light()6.
In my first attempt, I included epsilon because I have been so used to adding it when representing the results of a linear regression. However, because we are estimating the mean, the error is already accounted for and is, thus, not needed.
\(\widehat{Allisons} = -5359999.95 + 2799.63year\)
7.
In my first attempt, I did not make a comment on the residuals (just an oversight). I have now included my observations of the pattern of the residuals.
ggplot(data = model_1, aes(x = .resid)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "#bf5850") +
geom_density(color = "#f7b034", lwd = 1) +
theme_light() +
labs(x = "Residuals", y = "Density")The residuals are right skewed but have a mean around 0. Because the residuals are the actual - predicted value, we would hope to see residuals centered around 0, as that would indicate the regression doesn’t consistently overestimate or underestimate values. However, the underestimates seem to be larger, than the underestimates.
8.
The coefficient on year is positive, meaning that each year, the number of Allisons, or the popularity of the name “Allison,” is actually predicted to increase.
Part 5
1.
I, at first, used geom_col to plot the Allen/Alan/Allan observations. However, I should have used geom_bar because geom_bar makes the column heights proportional to the number of observations, which would better represent the popularity of the names relative to each other over time.
names_Allen <- names |>
filter(Sex == "M",
Name %in% c("Allen", "Allan", "Alan"))
names_Allen_count <- names_Allen |>
group_by(Year, Name) |>
summarise(Count = sum(Count))`summarise()` has grouped output by 'Year'. You can override using the
`.groups` argument.
ggplot(data = names_Allen_count, aes(x = Year, y = Count)) +
geom_line(aes(color = Name)) +
theme_light() +
scale_fill_brewer(palette = "Paired") +
labs(y = "Count")2.
names_Allen_sum <- names_Allen |>
filter(Year == 2000,
State %in% c("CA", "PA")) |>
group_by(Name, State) |>
summarise(Count = sum(Count)) |>
pivot_wider(names_from = Name, values_from = Count)
kable(names_Allen_sum)| State | Alan | Allan | Allen |
|---|---|---|---|
| CA | 579 | 131 | 176 |
| PA | 51 | 12 | 56 |
3.
names_Allen_sum |>
mutate(Total = Alan + Allan + Allen,
across(.cols = Alan:Allen, .fns = ~./Total)) |>
select(-Total) |>
kable() |>
kable_paper(c("striped", "hover"), html_font = "Helvetica") |>
column_spec(2:4, bold = TRUE, italic = TRUE, monospace = TRUE)| State | Alan | Allan | Allen |
|---|---|---|---|
| CA | 0.6534989 | 0.1478555 | 0.1986456 |
| PA | 0.4285714 | 0.1008403 | 0.4705882 |